In this notebook we experiment with the number of cold (burn-in) iterations for the Metropolis sampler used to break a substitution cipher.
In [35]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time
read text
In [36]:
def read_text_words(filename, wordsnumber):
    """Read the first `wordsnumber` *lines* of a file and join them.

    NOTE(review): despite the name, the limit is applied to lines
    (via readlines), not words -- confirm callers expect this.
    All newline characters are stripped from the result.
    """
    with open(filename) as f:
        head = f.readlines()[:wordsnumber]
    return ''.join(head).replace('\n', '')
def read_text_whole(filename):
    """Read an entire file into one string with all newlines removed."""
    with open(filename) as f:
        return f.read().replace('\n', '')
def chop_text_to_size(text, size):
    """Truncate text to at most `size` megabytes (size * 1024 * 1024 chars)."""
    limit = 1024 * 1024 * size
    return text[:limit]
def read_text_filesize(filename, size):
    """Read up to `size` megabytes from a file, with newlines removed."""
    with open(filename) as f:
        chunk = f.read(1024 * 1024 * size)
    return chunk.replace('\n', '')
counts
In [37]:
def get_unicount(text):
    """Count occurrences of each lowercase letter 'a'-'z' in `text`.

    Parameters
    ----------
    text : str
        Input text.  Characters outside 'a'-'z' are ignored.  (The old
        version indexed `counts[ord(c) - 97]` unconditionally: characters
        with ord 71..96 ('G'..'`') produced valid *negative* indices and
        silently corrupted the counts, while others raised IndexError.)

    Returns
    -------
    numpy.ndarray
        Length-26 array; counts[i] is the number of occurrences of chr(97 + i).
    """
    counts = np.zeros(26)
    for ch in text:
        idx = ord(ch) - 97  # 97..122 are 'a'..'z'
        if 0 <= idx < 26:
            counts[idx] += 1
    return counts
bigram statistics
In [ ]:
def get_bigram_stats_dic(text, counts=None):
    """Estimate conditional bigram frequencies count(ab)/count(a) over 'a'-'z'.

    Parameters
    ----------
    text : str
        Training text (expected to consist of lowercase letters).
    counts : sequence of 26 numbers, optional
        Unigram letter counts to normalize by.  If None they are computed
        from `text` itself.  (The old version silently read a module-level
        `counts` global, which made the function fail or give wrong results
        when called before that global existed.)

    Returns
    -------
    dict
        Maps every pair of letters (a, b) to count(ab) / count(a); pairs
        whose first letter never occurs map to 0 (the old code divided by
        zero there).
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    if counts is None:
        counts = [0] * 26
        for ch in text:
            i = ord(ch) - 97
            if 0 <= i < 26:
                counts[i] += 1
    dic = {}
    for a in alphabet:
        for b in alphabet:
            dic[(a, b)] = 0
    # zip(text, text[1:]) walks consecutive character pairs
    for a, b in zip(text, text[1:]):
        if (a, b) in dic:
            dic[(a, b)] += 1
    for k, v in dic.items():
        c = counts[ord(k[0]) - 97]
        dic[k] = v / c if c > 0 else 0  # guard against absent first letters
    return dic
quality
In [68]:
def quality(decrypted, original):
    """Fraction of positions where `decrypted` matches `original`.

    The comparison runs over min(len(decrypted), len(original)) positions
    (zip truncates) but is normalized by len(decrypted).
    """
    matches = sum(1.0 for d, o in zip(decrypted, original) if d == o)
    return matches / len(decrypted)
crypt
In [39]:
def crypt(text):
    """Encrypt `text` with a random substitution cipher over 'a'-'z'.

    Characters outside 'a'-'z' are dropped from the output.  (The old
    bare ``except: pass`` version did NOT drop everything: characters
    'G'..'`' (ord 71..96) gave small *negative* offsets, which are valid
    list indices in Python, so they were silently mis-encrypted instead
    of being skipped.)

    Returns
    -------
    (str, list)
        The encrypted text and the permutation p used, where plaintext
        letter i is replaced by letter p[i].
    """
    p = list(range(26))
    random.shuffle(p)
    output = ''
    for ch in text:
        x = ord(ch) - ord('a')
        if 0 <= x < 26:  # keep only lowercase letters
            output += chr(p[x] + ord('a'))
    return output, p
unnormalized desiredPDF (likelihood)
In [44]:
def get_desiredPDF_bigram(permutation, text=None, bigram_stats=None):
    """Log-likelihood (unnormalized) of a decrypting permutation.

    Parameters
    ----------
    permutation : sequence of int
        Candidate decryption: ciphertext letter i maps to permutation[i].
    text : str, optional
        Ciphertext to score; defaults to the module-level `encrypted`
        (the old version read that global implicitly).
    bigram_stats : dict, optional
        Bigram frequency table; defaults to the module-level `stats`.

    Returns
    -------
    float
        Sum of log bigram probabilities of the decrypted text, with a
        fixed -9 penalty for bigrams never seen in training.
    """
    if text is None:
        text = encrypted  # module-level ciphertext
    if bigram_stats is None:
        bigram_stats = stats  # module-level bigram table
    logp = 0
    for a, b in zip(text, text[1:]):
        pr = bigram_stats[chr(permutation[ord(a) - 97] + 97),
                          chr(permutation[ord(b) - 97] + 97)]
        if pr > 0:
            logp += math.log(pr)
        else:
            logp += -9  # penalty for non-existent pairs
    return logp
metropolis
In [41]:
def metropolis(desiredPDF, initValue, computableRVS, skipIterations=1000):
    """Metropolis sampler working in log-space; yields states after burn-in.

    Parameters
    ----------
    desiredPDF : callable
        Returns the *log* of the unnormalized target density of a state.
    initValue
        Starting state of the chain.
    computableRVS : callable
        Proposal: maps the current state to a candidate state.
    skipIterations : int, optional
        Number of burn-in ("cold") iterations before samples are yielded.
        (The old default was ``skipIterations = iterations``, which
        evaluated a module-level name at definition time -- a NameError,
        or later a *list*, which then broke the burn-in loop.)

    Yields
    ------
    (state, log_density)
        The chain state after each post-burn-in step (accepted or not).
    """
    current = initValue
    currentLogDensity = desiredPDF(current)
    # burn-in: let the chain forget its starting point
    for _ in range(skipIterations):
        candidate = computableRVS(current)
        candidateLogDensity = desiredPDF(candidate)
        # densities are logs, so the acceptance ratio becomes a difference
        acceptanceProb = min(0, candidateLogDensity - currentLogDensity)
        if math.log(random.random()) < acceptanceProb:
            current = candidate
            currentLogDensity = candidateLogDensity
    print("-----")  # marks the end of burn-in
    # chain is assumed converged: stream out states forever
    while True:
        candidate = computableRVS(current)
        candidateLogDensity = desiredPDF(candidate)
        acceptanceProb = min(0, candidateLogDensity - currentLogDensity)
        if math.log(random.random()) < acceptanceProb:
            current = candidate
            currentLogDensity = candidateLogDensity
        yield current, currentLogDensity
permutation generator and computablervs
In [43]:
def uniform( n ):
    """Draw a uniformly random permutation of 0..n-1 (Fisher-Yates shuffle)."""
    perm = list(range(n))
    # swap position i with a uniformly chosen position in [i, n-1] inclusive
    for i in range(n):
        j = random.randint(i, n - 1)
        perm[i], perm[j] = perm[j], perm[i]
    return perm
def applyTransposition( basePermutation ):
    """Return a copy of basePermutation with one random transposition applied.

    The two swap positions are drawn independently, so they may coincide,
    in which case the permutation comes back unchanged (identity swap).
    The input list is not modified.
    """
    size = len( basePermutation )
    result = copy( basePermutation )
    first, second = random.randint( 0, size - 1 ), random.randint( 0, size - 1 )
    result[ first ], result[ second ] = result[ second ], result[ first ]
    return result
In [ ]:
decrypt
In [45]:
def decrypt(permutation, encrypted):
    """Apply a letter permutation to ciphertext: letter i -> permutation[i]."""
    return ''.join(chr(permutation[ord(ch) - 97] + 97) for ch in encrypted)
In [46]:
#TEST TEXT: the text we will encrypt and then try to recover
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]  # [3:] skips a leading artifact (BOM/header?) -- TODO confirm
encrypted, p = crypt(original)
#TRAIN TEXT: source of the unigram/bigram statistics
train_text = read_text_whole('main/war_and_peace.txt')
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
print 'encrypting permutation: ', p
# Build bp = inverse of p.  Since p is a permutation of 0..25, iterating
# over its *values* still visits every number 0..25 exactly once, so
# bp[p[i]] = i correctly inverts it.
bp = np.zeros(26, dtype=int)
for i in p:
    bp[p[i]] = i
# Likelihood of the true inverse permutation (should be high) ...
q = get_desiredPDF_bigram(bp)
print 'inverse to encrypting permutation: ', bp
print 'its likelihood: ', q
# ... versus a random permutation (should be much lower).
ra = uniform(26)
q = get_desiredPDF_bigram(ra)
print 'likelihood of random permutation: ', q
In [82]:
# Experiment 1: vary the number of burn-in ("cold") iterations while the
# number of collected samples stays fixed at 500.  Repeated for 4 random
# restarts (k); qs stores one quality value per (restart, setting) pair.
import time
iterations = [250,500,1000,2000]
qs = np.zeros(16)
for k in xrange(4):
    j=0
    init_p = uniform(26)  # fresh random starting permutation per restart
    for it in iterations:
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, it)
        x = []
        y = []
        # collect 500 samples after `it` burn-in iterations
        for i in xrange( 500 ):
            a,b = metropolisgenerator.next()
            x.append(a)
            y.append(b)
        et = time.time() - st
        print 'cold iterations: ', it
        print 'metropolis time: ', et
        # pick the sampled permutation with the highest log-density
        best = np.argmax(y)
        bestx = x[best]
        print 'best density among ', 500, ' last iterations: ', y[best]
        print 'corresponding permutation: ', bestx
        decrypted = decrypt(bestx, encrypted)
        # qs layout: 4 consecutive entries per restart k (index 4*k+j)
        qs[4*k+j] = quality(decrypted, original)
        print 'quality: ', qs[4*k+j]
        j+=1
# one quality-vs-burn-in curve per restart
plt.plot(iterations, qs[:4], iterations, qs[4:8], iterations, qs[8:12], iterations, qs[12:16])
Out[82]:
In [ ]:
# NOTE(review): calling savefig in a separate cell after the figure was
# displayed usually saves an *empty* figure; it should run in the same
# cell as the plt.plot above -- confirm the saved image is not blank.
plt.savefig('Bigram,noWordDelimiter,metrop,varyColdIters')
In [83]:
iterations = [250,500,1000,2000]
qs = np.zeros(16)
for k in xrange(4):
j=0
init_p = uniform(26)
for it in iterations:
st = time.time()
computableGen = lambda t: applyedTranspostions(t)
metropolisgenerator = \
metropolis(get_desiredPDF_bigram, init_p, computableGen, 0)
x = []
y = []
for i in xrange( it ):
a,b = metropolisgenerator.next()
x.append(a)
y.append(b)
et = time.time() - st
print 'cold iterations: ',
print 'metropolis time: ', et
best = np.argmax(y)
bestx = x[best]
print 'best density among ', it, ' last iterations: ', y[best]
print 'corresponding permutation: ', bestx
decrypted = decrypt(bestx, encrypted)
qs[4*k+j] = quality(decrypted, original)
print 'quality: ', qs[4*k+j]
j+=1
plt.plot(iterations, qs[:4], iterations, qs[4:8], iterations, qs[8:12], iterations, qs[12:16])
Out[83]:
In [84]:
# NOTE(review): as with the previous figure, savefig in a cell of its own
# likely writes an empty image; move it next to the corresponding plt.plot.
plt.savefig('Bigram,noWordDelimiter,metrop,varyIters')
In [2]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
iterations = [250, 500, 1000, 2000]
stat_raw = np.array([
12.8880000114, 0.123917748918,
26.8050000668, 0.492153679654,
51.4409999847, 0.577245670996,
106.414999962, 1.0,
13.2339999676, 0.106466450216,
26.1779999733, 0.0572691197691,
51.9179999828, 0.686327561328,
108.29700017, 1.0,
12.7339999676, 0.136228354978,
25.6190001965, 0.762490981241,
51.6050000191, 0.910984848485,
104.585999966, 1.0,
12.986000061, 0.000631313131313,
25.6989998817, 0.703373015873,
53.864000082, 0.135326479076,
108.180000067, 1.0]).reshape((16, 2))
stat = np.array([stat_raw[::4, 1], stat_raw[1::4, 1], stat_raw[2::4, 1], stat_raw[3::4, 1]])
means = np.mean(stat, 1)
stds = np.std(stat, 1)
print(means)
print(stds)
plt.title('Dependence quality on cold iterations')
plt.xlabel('iterations')
plt.ylabel('quolity')
plt.plot(iterations, means - stds, 'r:')
plt.plot(iterations, means + stds, 'r:')
plt.plot(iterations, means, 'b-')
plt.savefig('task-2-daniel.png')
In [ ]: